library(xgboost)
library(randomForest)
library(tidyverse)
library(lubridate)
source('functions.r')
load("Table_construction.Rdata")
### Add useful columns to features and apply row filters used for all models
# Build the modelling table: one row per (person_id, screening_date) that has
# demographics, features, "features_on" and outcomes, restricted to rows with
# valid COMPAS decile scores and a known current offense date.
features_filt <- features %>%
  inner_join(
    data_before %>%
      select(person_id, screening_date, people) %>%
      # Name the list-column explicitly: calling unnest() without columns is
      # deprecated in tidyr >= 1.0 and emits a warning on every run. `people`
      # is the only nested column selected above, so the result is unchanged.
      unnest(people) %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id", "screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id", "screening_date")) %>%
  inner_join(outcomes, by = c("person_id", "screening_date")) %>%
  filter(`Risk of Recidivism_decile_score` != -1, `Risk of Violence_decile_score` != -1) %>% # Filter 1
  filter(!is.na(current_offense_date)) %>% # Filter 3
  mutate(
    p_recid_raw = `Risk of Recidivism_raw_score`,
    # Quadratic in current age; subtracting it from the raw score leaves the
    # "remainder" (p_recid_raw_noage) that the models below try to predict.
    age_poly = 0.000492285131636128000*p_current_age^2 - 0.0775341320826139000*p_current_age + 0.0305011936304372000,
    p_recid_raw_noage = p_recid_raw - age_poly
  )
## Hyper-parameter grids (each combination will be run)
# xgboost
# NOTE(review): "reg:linear" is the legacy name for squared-error regression;
# newer xgboost releases prefer "reg:squarederror" -- confirm against the
# installed version before changing, since the echoed output records this name.
param <- list(
  objective        = "reg:linear",
  eval_metric      = "rmse",
  eta              = c(0.05, 0.1),
  gamma            = c(0.5, 1),
  max_depth        = c(2, 5),
  min_child_weight = c(5, 10),
  subsample        = 1,
  colsample_bytree = 1
)
# svm
param_svm <- list(
  type        = "eps-regression",
  cost        = c(0.5, 1, 2),
  epsilon     = c(0.5, 1, 1.5),
  gamma_scale = c(0.5, 1, 2)
)
# One row per feature group; each model's RMSE is filled in as the groups run.
res_rmse <- data.frame(Group = seq_len(5), lm = NA, xgb = NA, rf = NA, svm = NA)
## Age polynomial
# COMPAS general raw score against age (points) with the age polynomial
# overlaid (line); the x-axis is restricted to ages 18-70.
ggplot(data = features_filt) +
  geom_point(aes(x = p_current_age, y = p_recid_raw, color = "b"), alpha = 0.3) +
  geom_line(aes(x = p_current_age, y = age_poly, color = "a")) +
  theme_bw() +
  xlim(18, 70) +
  labs(x = "Age at COMPAS screening date", y = "COMPAS general raw") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12),
    legend.position = "none"
  )
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_path).
# No plot argument is given, so ggsave() writes the most recent ggplot.
ggsave("Figures/age_agePoly_general.pdf", width = 3.5, height = 2.5, units = "in")
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_path).
### Number of priors vs. COMPAS remainder
# Scatter of the age-adjusted raw score against prior charge count,
# with the x-axis limited to 0-60 charges.
features_filt %>%
  ggplot() +
  geom_point(aes(x = p_charge, y = p_recid_raw_noage), alpha = 0.3) +
  theme_bw() +
  labs(x = "Number of prior charges", y = "COMPAS general raw remainder") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  ) +
  xlim(0, 60)
## Warning: Removed 11 rows containing missing values (geom_point).
# No plot argument is given, so ggsave() writes the most recent ggplot.
ggsave("Figures/priors_rawScoreRemain_general.pdf", width = 3.5, height = 3.5, units = "in")
## Warning: Removed 11 rows containing missing values (geom_point).
# Recode the filtered data for a logistic regression of "medium/high COMPAS
# decile" on demographics, prior charges, charge degree and recidivism.
propub <- features_filt %>%
  filter(screening_date <= current_offense_date_limit) %>% # Only people with valid recidivism values
  mutate(
    age_low = as.numeric(p_current_age < 25),
    age_high = as.numeric(p_current_age > 45),
    female = as.numeric(sex == "Female"),
    n_priors = p_felony_count_person + p_misdem_count_person,
    # Medium and High risk scores get +1 label
    compas_high = as.numeric(`Risk of Recidivism_decile_score` >= 5),
    # Base level is Caucasian, as in ProPublica analysis
    race = relevel(factor(race), ref = "Caucasian")
  )
# Formula is kept inline so the echoed Call and coefficient names are unchanged.
mdl_glm <- glm(
  compas_high ~ female + age_high + age_low + as.factor(race) +
    p_charge + is_misdem + recid,
  family = binomial(link = "logit"),
  data = propub
)
summary(mdl_glm)
##
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) +
## p_charge + is_misdem + recid, family = binomial(link = "logit"),
## data = propub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7558 -0.7641 -0.3055 0.8402 2.6712
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.604475 0.082621 -19.420 < 2e-16 ***
## female 0.129382 0.085337 1.516 0.1295
## age_high -1.485491 0.130168 -11.412 < 2e-16 ***
## age_low 1.443967 0.071469 20.204 < 2e-16 ***
## as.factor(race)African-American 0.522728 0.073118 7.149 8.73e-13 ***
## as.factor(race)Asian -0.270324 0.504164 -0.536 0.5918
## as.factor(race)Hispanic -0.307350 0.131763 -2.333 0.0197 *
## as.factor(race)Native American 0.387967 0.678561 0.572 0.5675
## as.factor(race)Other -0.720007 0.160307 -4.491 7.08e-06 ***
## p_charge 0.156196 0.006582 23.731 < 2e-16 ***
## is_misdem -0.449143 0.069824 -6.433 1.26e-10 ***
## recid 0.497513 0.068974 7.213 5.47e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7864.7 on 5726 degrees of freedom
## Residual deviance: 5633.5 on 5715 degrees of freedom
## AIC: 5657.5
##
## Number of Fisher Scoring iterations: 5
### Create group 1 training data
## Criminal-history features only (no current age, no race); the jail,
## prison and probation counts are capped at 5.
train <- transmute(
  features_filt,
  #p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30 = pmin(p_jail30, 5),
  p_prison = pmin(p_prison, 5),
  p_probation = pmin(p_probation, 5),
  p_recid_raw_noage
)
## Format for xgboost: predictors as a numeric matrix, target as the label
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)
# Linear baseline: regress the age-adjusted score on every other column.
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.67940 -0.45225 -0.06694 0.37222 2.54441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0555467 0.0177396 59.50 <2e-16 ***
## p_age_first_offense -0.0067863 0.0005581 -12.16 <2e-16 ***
## p_charge 0.0256917 0.0010352 24.82 <2e-16 ***
## p_jail30 -0.0073008 0.0404757 -0.18 0.857
## p_prison 0.2084061 0.0085096 24.49 <2e-16 ***
## p_probation 0.1179271 0.0074736 15.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5759 on 9036 degrees of freedom
## Multiple R-squared: 0.4061, Adjusted R-squared: 0.4058
## F-statistic: 1236 on 5 and 9036 DF, p-value: < 2.2e-16
# RMSE of the linear model on the data it was fitted to (group 1).
res_rmse$lm[res_rmse$Group == 1] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP
# Seed the RNG, then fit xgboost over the `param` grid (fit_xgboost is
# defined outside this file section).
set.seed(923)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 6
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix versus the actual age-adjusted score.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage
res_rmse$xgb[res_rmse$Group == 1] <- rmse(pred, actual) # ADJUST GROUP
# Shared limits so both axes span the same range under coord_fixed().
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(x = "COMPAS remainder", y = "xgboost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
### Variable importance
xgb.importance(model = mdl_xgb) %>%
  xgb.plot.importance(importance_matrix = .)
# Random forest on the same formula, seeded for reproducibility.
set.seed(784)
mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)
# NOTE(review): per the randomForest docs `$predicted` holds out-of-bag
# predictions, whereas the other models are scored on in-sample fits --
# confirm this comparison is intended.
res_rmse$rf[res_rmse$Group == 1] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP
# SVM regression tuned over `param_svm` (fit_svm is defined outside this
# file section).
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## [1] "Best parameters:"
## 19
## type "eps-regression"
## cost "0.5"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.3333333"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Score the SVM using the `fitted` component of the model object (group 1).
res_rmse$svm[res_rmse$Group == 1] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP
# Remove all group-1 data and models before the next group is built.
rm(list = c("train", "train_xgb", "mdl_lm", "mdl_xgb", "mdl_rf", "mdl_svm"))
### Create group 2 training data
## Criminal-history features plus race indicators (no current age); the
## jail, prison and probation counts are capped at 5.
train <- transmute(
  features_filt,
  #p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30 = pmin(p_jail30, 5),
  p_prison = pmin(p_prison, 5),
  p_probation = pmin(p_probation, 5),
  race_black = as.numeric(race == "African-American"),
  race_white = as.numeric(race == "Caucasian"),
  race_hispanic = as.numeric(race == "Hispanic"),
  race_asian = as.numeric(race == "Asian"),
  race_native = as.numeric(race == "Native American"), # race == "Other" is the baseline
  p_recid_raw_noage
)
## Format for xgboost: predictors as a numeric matrix, target as the label
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)
# Linear baseline: regress the age-adjusted score on every other column.
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50492 -0.43903 -0.06279 0.36448 2.39305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7336356 0.0298015 24.617 < 2e-16 ***
## p_age_first_offense -0.0048067 0.0005678 -8.466 < 2e-16 ***
## p_charge 0.0248022 0.0010184 24.355 < 2e-16 ***
## p_jail30 0.0065670 0.0397765 0.165 0.86887
## p_prison 0.1984432 0.0083906 23.651 < 2e-16 ***
## p_probation 0.1147516 0.0073457 15.622 < 2e-16 ***
## race_black 0.3692502 0.0257204 14.356 < 2e-16 ***
## race_white 0.2448093 0.0259935 9.418 < 2e-16 ***
## race_hispanic 0.0864109 0.0311533 2.774 0.00555 **
## race_asian 0.0919404 0.0859397 1.070 0.28473
## race_native 0.2592508 0.1096961 2.363 0.01813 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5657 on 9031 degrees of freedom
## Multiple R-squared: 0.4273, Adjusted R-squared: 0.4267
## F-statistic: 673.8 on 10 and 9031 DF, p-value: < 2.2e-16
# RMSE of the linear model on the data it was fitted to (group 2).
res_rmse$lm[res_rmse$Group == 2] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP
# Seed the RNG, then fit xgboost over the `param` grid (fit_xgboost is
# defined outside this file section).
set.seed(480)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix versus the actual age-adjusted score.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage
res_rmse$xgb[res_rmse$Group == 2] <- rmse(pred, actual) # ADJUST GROUP
# Shared limits so both axes span the same range under coord_fixed().
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(x = "COMPAS raw score remainder", y = "XGBoost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
# Same predictions against the unadjusted raw score (age term not removed).
ggplot(data.frame(xgboost = pred, compas = features_filt$p_recid_raw)) +
  geom_point(aes(x = xgboost, y = compas), alpha = 0.3) +
  theme_bw() +
  labs(x = "XGBoost prediction", y = "COMPAS raw score") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
### Variable importance
xgb.importance(model = mdl_xgb) %>%
  xgb.plot.importance(importance_matrix = .)
# Random forest on the same formula, seeded for reproducibility.
set.seed(6778)
mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)
# NOTE(review): per the randomForest docs `$predicted` holds out-of-bag
# predictions, whereas the other models are scored on in-sample fits --
# confirm this comparison is intended.
res_rmse$rf[res_rmse$Group == 2] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP
# SVM regression tuned over `param_svm` (fit_svm is defined outside this
# file section).
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## [1] "Best parameters:"
## 12
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.09090909"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Score the SVM using the `fitted` component of the model object (group 2).
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP
# Remove every group-2 artefact, including mdl_svm (as the group-1 cleanup
# does), so a failed fit in the next group cannot silently reuse a stale model.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 3 training data
## Current age plus criminal-history features (no race); the jail, prison
## and probation counts are capped at 5.
train <- transmute(
  features_filt,
  p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30 = pmin(p_jail30, 5),
  p_prison = pmin(p_prison, 5),
  p_probation = pmin(p_probation, 5),
  p_recid_raw_noage
)
## Format for xgboost: predictors as a numeric matrix, target as the label
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)
# Linear baseline: regress the age-adjusted score on every other column.
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.26968 -0.44787 -0.07239 0.36897 2.55581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.014740 0.018166 55.858 <2e-16 ***
## p_current_age 0.011466 0.001206 9.510 <2e-16 ***
## p_age_first_offense -0.017645 0.001270 -13.897 <2e-16 ***
## p_charge 0.022968 0.001069 21.483 <2e-16 ***
## p_jail30 0.020908 0.040386 0.518 0.605
## p_prison 0.184507 0.008833 20.889 <2e-16 ***
## p_probation 0.096735 0.007764 12.460 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.573 on 9035 degrees of freedom
## Multiple R-squared: 0.412, Adjusted R-squared: 0.4116
## F-statistic: 1055 on 6 and 9035 DF, p-value: < 2.2e-16
# RMSE of the linear model on the data it was fitted to (group 3).
res_rmse$lm[res_rmse$Group == 3] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP
# Seed the RNG, then fit xgboost over the `param` grid (fit_xgboost is
# defined outside this file section).
set.seed(999)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix versus the actual age-adjusted score.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage
res_rmse$xgb[res_rmse$Group == 3] <- rmse(pred, actual) # ADJUST GROUP
# Shared limits so both axes span the same range under coord_fixed().
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(x = "COMPAS raw score remainder", y = "xgboost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
### Variable importance
xgb.importance(model = mdl_xgb) %>%
  xgb.plot.importance(importance_matrix = .)
# Random forest on the same formula, seeded for reproducibility.
set.seed(5)
mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)
# NOTE(review): per the randomForest docs `$predicted` holds out-of-bag
# predictions, whereas the other models are scored on in-sample fits --
# confirm this comparison is intended.
res_rmse$rf[res_rmse$Group == 3] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP
# SVM regression tuned over `param_svm` (fit_svm is defined outside this
# file section).
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## [1] "Best parameters:"
## 11
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.1428571"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Score the SVM using the `fitted` component of the model object (group 3).
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP
# Remove every group-3 artefact, including mdl_svm (as the group-1 cleanup
# does), so a failed fit in the next group cannot silently reuse a stale model.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 4 training data
## Current age, criminal-history features and race indicators; the jail,
## prison and probation counts are capped at 5.
train <- transmute(
  features_filt,
  p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30 = pmin(p_jail30, 5),
  p_prison = pmin(p_prison, 5),
  p_probation = pmin(p_probation, 5),
  race_black = as.numeric(race == "African-American"),
  race_white = as.numeric(race == "Caucasian"),
  race_hispanic = as.numeric(race == "Hispanic"),
  race_asian = as.numeric(race == "Asian"),
  race_native = as.numeric(race == "Native American"), # race == "Other" is the baseline
  p_recid_raw_noage
)
## Format for xgboost: predictors as a numeric matrix, target as the label
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)
# Linear baseline: regress the age-adjusted score on every other column.
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.09334 -0.42968 -0.06403 0.35999 2.40135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.692957 0.029955 23.133 < 2e-16 ***
## p_current_age 0.011375 0.001187 9.583 < 2e-16 ***
## p_age_first_offense -0.015533 0.001254 -12.389 < 2e-16 ***
## p_charge 0.022096 0.001052 21.005 < 2e-16 ***
## p_jail30 0.034923 0.039688 0.880 0.37893
## p_prison 0.174521 0.008714 20.028 < 2e-16 ***
## p_probation 0.093816 0.007629 12.298 < 2e-16 ***
## race_black 0.369928 0.025592 14.455 < 2e-16 ***
## race_white 0.238926 0.025871 9.235 < 2e-16 ***
## race_hispanic 0.093102 0.031006 3.003 0.00268 **
## race_asian 0.100984 0.085516 1.181 0.23768
## race_native 0.247590 0.109155 2.268 0.02334 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5628 on 9030 degrees of freedom
## Multiple R-squared: 0.4331, Adjusted R-squared: 0.4324
## F-statistic: 627 on 11 and 9030 DF, p-value: < 2.2e-16
# RMSE of the linear model on the data it was fitted to (group 4).
res_rmse$lm[res_rmse$Group == 4] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP
# Seed the RNG, then fit xgboost over the `param` grid (fit_xgboost is
# defined outside this file section).
set.seed(23)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix versus the actual age-adjusted score.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage
res_rmse$xgb[res_rmse$Group == 4] <- rmse(pred, actual) # ADJUST GROUP
# Shared limits so both axes span the same range under coord_fixed().
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)
ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(
    x = "COMPAS general raw remainder",
    y = "Prediction of COMPAS general raw remainder"
  ) +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  )
# No plot argument is given, so ggsave() writes the most recent ggplot.
ggsave("Figures/rawScoreRemain_xgboost_general.pdf", width = 4, height = 4, units = "in")
### Variable importance
xgb.importance(model = mdl_xgb) %>%
  xgb.plot.importance(importance_matrix = .)
highlight = data.frame(
person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
highlight = TRUE
)
df_plot = features_filt %>%
bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
left_join(highlight, by = c("person_id","screening_date")) %>%
mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))
# person_ids of highlighted points, grouped by where the name label is placed
# relative to its point.
person_id_text_topright <- c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft <- c(1394, 1497)
person_id_text_botright <- c(11312, 6886, 8491, 10774)
person_id_text_botleft <- c(799)

# Builds one text layer that labels the highlighted people in `label_person_ids`
# with their name, using the given horizontal/vertical justification.
name_label_layer <- function(label_person_ids, hjust, vjust) {
  geom_text(
    aes(x = xgboost, y = p_recid_raw, label = name),
    size = 3, nudge_x = 0, nudge_y = 0,
    hjust = hjust, vjust = vjust,
    data = filter(
      df_plot,
      person_id %in% label_person_ids & highlight == "In Table 5"
    )
  )
}

ggplot() +
  # Non-highlighted points first (semi-transparent), highlighted ones on top.
  geom_point(
    aes(x = xgboost, y = p_recid_raw, color = highlight),
    alpha = .3,
    data = filter(df_plot, highlight == "Not in Table 5")
  ) +
  geom_point(
    aes(x = xgboost, y = p_recid_raw, color = highlight),
    data = filter(df_plot, highlight == "In Table 5")
  ) +
  theme_bw() +
  name_label_layer(person_id_text_topright, hjust = "left", vjust = "bottom") +
  name_label_layer(person_id_text_topleft, hjust = "right", vjust = "bottom") +
  name_label_layer(person_id_text_botright, hjust = "left", vjust = "top") +
  name_label_layer(person_id_text_botleft, hjust = "right", vjust = "top") +
  xlab("Prediction of COMPAS general raw remainder") +
  ylab("COMPAS general raw") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12),
    #legend.position = "top",
    legend.position = "none"
  ) +
  scale_color_discrete(name = element_blank()) +
  xlim(0.2, 3.5)
## Warning: Removed 2 rows containing missing values (geom_point).
# Write the plot to disk; no `plot` argument is given, so ggsave() uses its default.
ggsave(
  filename = "Figures/xgboost_rawScore_general.pdf",
  width = 4,
  height = 4,
  units = "in"
)
## Warning: Removed 2 rows containing missing values (geom_point).
# Random forest on the current training set; the seed is set immediately
# before the fit.
set.seed(3720)
mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)
# RMSE of the forest's `predicted` component against the training target.
res_rmse$rf[res_rmse$Group == 4] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP
# SVM fitted via fit_svm() over the combinations listed in param_svm.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## [1] "Best parameters:"
## 20
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1666667"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Record the SVM RMSE (from its `fitted` component) for this group.
res_rmse$svm[res_rmse$Group == 4] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP
# Drop this group's training data and models before building the next group.
rm(list = c("train", "train_xgb", "mdl_lm", "mdl_xgb", "mdl_rf"))
### Create group 5 training data
## Select features and cap the count features at 5
# FIX: p_prison30 was previously computed as pmin(p_jail30, 5). Because
# transmute() evaluates its expressions in order, that produced an exact copy
# of the already-capped p_jail30 column, so the two predictors were perfectly
# collinear (lm reported p_prison30 as NA "because of singularities").
# It is now derived from p_prison30 itself.
# NOTE(review): any model output recorded below this block was produced with
# the old definition and should be regenerated; check whether the training
# sets for the earlier groups contain the same copy-paste slip.
train <- features_filt %>%
  transmute(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_arrest,
    p_jail30 = pmin(p_jail30, 5),
    p_prison30 = pmin(p_prison30, 5),
    p_prison = pmin(p_prison, 5),
    p_probation = pmin(p_probation, 5),
    # One-hot race indicators; race == "Other" is the baseline
    race_black = if_else(race == "African-American", 1, 0),
    race_white = if_else(race == "Caucasian", 1, 0),
    race_hispanic = if_else(race == "Hispanic", 1, 0),
    race_asian = if_else(race == "Asian", 1, 0),
    race_native = if_else(race == "Native American", 1, 0),
    # Regression target
    p_recid_raw_noage
  )
## Format for xgboost
# Predictors are every column except the target; the target is passed as the label.
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)
# Linear model of the target on all remaining columns of `train`.
mdl_lm <- lm(p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
##
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9676 -0.4323 -0.0629 0.3625 2.4067
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.695934 0.029936 23.247 < 2e-16 ***
## p_current_age 0.011899 0.001192 9.978 < 2e-16 ***
## p_age_first_offense -0.016087 0.001260 -12.772 < 2e-16 ***
## p_charge 0.014255 0.002143 6.651 3.08e-11 ***
## p_arrest 0.006877 0.001638 4.197 2.73e-05 ***
## p_jail30 0.023470 0.039746 0.591 0.5549
## p_prison30 NA NA NA NA
## p_prison 0.170361 0.008762 19.443 < 2e-16 ***
## p_probation 0.081617 0.008157 10.006 < 2e-16 ***
## race_black 0.371083 0.025570 14.512 < 2e-16 ***
## race_white 0.240438 0.025850 9.301 < 2e-16 ***
## race_hispanic 0.095303 0.030982 3.076 0.0021 **
## race_asian 0.101744 0.085438 1.191 0.2337
## race_native 0.242682 0.109061 2.225 0.0261 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5623 on 9029 degrees of freedom
## Multiple R-squared: 0.4342, Adjusted R-squared: 0.4334
## F-statistic: 577.3 on 12 and 9029 DF, p-value: < 2.2e-16
# In-sample RMSE of the linear model for this group.
res_rmse$lm[res_rmse$Group == 5] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading
# xgboost fitted via fit_xgboost() with the settings in `param`; the seed is
# set immediately before the fit.
set.seed(480)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 8
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "1"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage
res_rmse$xgb[res_rmse$Group == 5] <- rmse(pred, actual) # ADJUST GROUP
# Common limits so both axes span the same range of predicted and actual values.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)
# Predicted vs. actual scatter with a red slope-1 reference line.
ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  xlab("COMPAS raw score remainder") +
  ylab("xgboost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
# Random forest on the group 5 training set; the seed is set immediately
# before the fit.
set.seed(1123)
mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)
# RMSE of the forest's `predicted` component against the training target.
res_rmse$rf[res_rmse$Group == 5] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP
# SVM fitted via fit_svm() over the combinations listed in param_svm.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## [1] "Best parameters:"
## 12
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.07142857"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Record the SVM RMSE (from its `fitted` component) for this group.
res_rmse$svm[res_rmse$Group == 5] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP
# Drop this group's training data and models.
rm(list = c("train", "train_xgb", "mdl_lm", "mdl_xgb", "mdl_rf"))
# Render the per-group, per-model RMSE table.
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 1 | 0.5756646 | 0.5228469 | 0.5553384 | 0.5295423 |
| 2 | 0.5653175 | 0.5128263 | 0.5276490 | 0.5213844 |
| 3 | 0.5728047 | 0.5171593 | 0.5330514 | 0.5260242 |
| 4 | 0.5624647 | 0.5058171 | 0.5244834 | 0.5146050 |
| 5 | 0.5619167 | 0.4971066 | 0.5141074 | 0.5097631 |